## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
e <- read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_anthropometric_all.xlsx")

#Dealing with NAs
ena<-na.omit(e)

#Renaming Variables(Renaming Column Names for easy code handling and Cleaning Data)
names(ena)[names(ena)=='measurement date']<- 'measurement_date'
names(ena)[names(ena)=='age (years)']<- 'age_years'
names(ena)[names(ena)=='age bin']<- 'age_bin'
names(ena)[length(ena)]<-"z_cat_WHO"
names(ena)[9]<-"z_score_WHO"
names(ena)[6]<-"height_cm"
names(ena)[7]<-"weight_kg"

#Removing "NA" Characters
ena$observation <- 1:nrow(ena)
x<-ena[ena$z_cat_WHO=="NA",]
y<-x$observation
ena<-ena[-y,]

#Adding additional column
ena$year <- year(ena$measurement_date)
ena$ID <- as.factor(ena$ID)
#Changing Data Types
ena <- ena %>%
  mutate(gender = as.factor(gender),
         z_cat_WHO=as.factor(z_cat_WHO),
         measurement_date=as.Date(measurement_date),
         BMI = as.numeric(BMI),
         z_score_WHO=as.numeric(z_score_WHO),
         height_cm=as.numeric(height_cm),
         weight_kg=as.numeric(weight_kg))

Diving data as per Year

ena_2007 <- filter(ena, year(measurement_date) == 2007)
ena_2008 <- filter(ena, year(measurement_date) == 2008)
ena_2009 <- filter(ena, year(measurement_date) == 2009)
ena_2010 <- filter(ena, year(measurement_date) == 2010)
ena_2011 <- filter(ena, year(measurement_date) == 2011)
ena_2012 <- filter(ena, year(measurement_date) == 2012)
ena_2013 <- filter(ena, year(measurement_date) == 2013)
ena_2014 <- filter(ena, year(measurement_date) == 2014)
ena_2015 <- filter(ena, year(measurement_date) == 2015)
ena_2016 <- filter(ena, year(measurement_date) == 2016)
ena_2017 <- filter(ena, year(measurement_date) == 2017)
ena_2018 <- filter(ena, year(measurement_date) == 2018)

Checking uniques IDs whose observation has been taken continuously from 2007 to 2018

#kids data of 2007 whose obeservation were taken till the end of 2018 without any miss.
a_07_08 <- subset(ena_2007, ID %in% ena_2008$ID)
a_07_09 <- subset(a_07_08, ID %in% ena_2009$ID)
a_07_10 <- subset(a_07_09, ID %in% ena_2010$ID)
a_07_11 <- subset(a_07_10, ID %in% ena_2011$ID)
a_07_12 <- subset(a_07_11, ID %in% ena_2012$ID)
a_07_13 <- subset(a_07_12, ID %in% ena_2013$ID)
a_07_14 <- subset(a_07_13, ID %in% ena_2014$ID)
a_07_15 <- subset(a_07_14, ID %in% ena_2015$ID)
a_07_16 <- subset(a_07_15, ID %in% ena_2016$ID)
a_07_17 <- subset(a_07_16, ID %in% ena_2017$ID)
a_07_18 <- subset(a_07_17, ID %in% ena_2018$ID)

#All data of the a_07_18 dataset kids from 2007 to 2018.
a_unique_07<- subset(ena, ID %in% a_07_18$ID)
#Checking prop increase per year
a_unique_07_pct<-a_unique_07 %>%
  group_by(ID) %>% 
  dplyr::arrange(measurement_date, .by_group = TRUE) %>%
  dplyr::mutate(pct_change_ht = (height_cm/lag(height_cm) - 1) * 100) %>% 
  dplyr::mutate(pct_change_wt = (weight_kg/lag(weight_kg) - 1) * 100) %>%
  dplyr::mutate(pct_change_bmi = (BMI/lag(BMI) - 1) * 100) %>%
  dplyr::select(ID, measurement_date, height_cm, BMI, weight_kg,pct_change_ht,pct_change_wt,pct_change_bmi,gender)

Weight Analysis

a_unique_07$is_boy <- as.factor(ifelse(a_unique_07$gender=='boy', "1", "0"))

#Boys Weight
a_unique_07_boys <- filter(a_unique_07, gender=="boy")
b<-ggplot(a_unique_07_boys, aes(x=age_years, y=weight_kg, color=ID)) + geom_line() + theme(legend.position = "none") + ggtitle("Weight of all Boys ")
ggplotly(b)
#Girls Weight
a_unique_07_girls <- filter(a_unique_07, gender=="girl")
g<-ggplot(a_unique_07_girls, aes(x=age_years, y=weight_kg, color=ID)) + geom_line() +
  theme(legend.position = "none")+ ggtitle("Weight of all Girls")
ggplotly(g)
plot_grid(b,g)

#Comparing visually we can say that the weight increase rate is more in boys as compared to girls.

gam() or bam()

There are two functions for implementing a GAMM model: gam() and bam(). There are largely similar. The most important difference is that bam() is optimized for big data sets.

Smooths terms To model a potentially nonlinear smooth or surface, three different smooth functions are available: * s() : for modeling a 1-dimensional smooth, or for modeling isotropic interactions (variables are measured in same units and on same scale)

te(): for modeling 2- or n-dimensional interaction surfaces of variables that are not isotropic (but see info about d parameter below). Includes ‘main’ effects.

ti(): for modeling 2- or n-dimensional interaction surfaces that do not include the ‘main effects’.

Parameters of smooth functions The smooth functions have several parameters that could be set to change their behavior. The most often-used parameters are listed here:

k : number of ‘knots’. This parameter determines the upper bound of the number of underlying base functions being used to build up the curve. Thus, this parameter constraints the wigglyness of a smooth, or - as a metaphor - the number of bowpoints of a curve. Note that the model will base the number of base functions (reflected in the edf of the summary) on the data with the setting for k as upper bound. By default, the value of k for s() is around 9, and for te() and ti() 5 per dimension. Importantly, the value of k should be at most one less than the number of unique data points, otherwise it will fit the density of that predictor.

d : for specifiying that predictors in the interaction are on the same scale or dimension (only used in te() and ti()). For example, in te(Time, width, height, d=c(1,2)), with width and height reflecting the picture size measured in pixels, we specify that Time is on a different dimension than the next two variables. By default, the value would be d=c(1,1,1) in this case.

bs: specifies the type of underlying base functios. For s() this defaults to “tp” (thin plate regression spline) and for te() and ti() this defaults to “cr” (cubic regression spline). For random intercepts and linear random slopes use bs=“re”, but for random smooths use bs=“fs” (see below).

Setting up a GAMM model Before the different components of a GAMM model were summarized. In this section we focus on finding the model that best fits the data.

We start with the full model. Note that we did not include interactions with the predictor Trial, because it is not the focus of our simulated experiment, and it takes much longer to run. Therefore, we only included Trial as random effect. We did not include Item effects, because these were not specified in this data set.

Random effects Three different types of random effects are distinghuished when using GAMMs:

random intercepts adjust the height of other modelterms with a constant value: s(ID, bs=“re”).

random slopes adjust the slope of the trend of a numeric predictor: s(ID, age, bs=“re”).

random smooths adjust the trend of a numeric predictor in a nonlinear way: s(age, ID, bs=“fs”, m=1).

Notes:

Random intercepts and random slopes could be combined, but the random smooths already include random intercepts and random slope effects.

The argument m=1 sets a heavier penalty for the smooth moving away from 0, causing shrinkage to the mean.

Here we are using bam as gam takes longer time for huge datasets. Bam is a similar and an optimised version of gam for huge datasets.

No Random Effect

#For Boys
model1_b <- bam(weight_kg ~ age_years
                + height_cm,
          data=a_unique_07_boys)
summary(model1_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## weight_kg ~ age_years + height_cm
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99.65606    3.33461 -29.885   <2e-16 ***
## age_years    -0.46752    0.20813  -2.246   0.0248 *  
## height_cm     0.99737    0.03536  28.207   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =   0.75   Deviance explained = 75.1%
## -REML = 5163.3  Scale est. = 103.09    n = 1381
gam.check(model1_b)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-3.962566e-08,-3.962566e-08]
## (score 5163.302 & scale 103.0917).
## Hessian positive definite, eigenvalue range [689,689].
## Model rank =  3 / 3

Random intercepts Effect

model2_b <- bam(weight_kg ~ age_years
                + height_cm
                + s(ID, bs="re"),
          data=a_unique_07_boys)
#summary(model2_b)
model2_b_1<-getViz(model2_b)
print(plot(model2_b_1, allTerms = T), pages = 1)

gam.check(model2_b)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 6 iterations.
## Gradient range [-1.770759e-06,1.653528e-06]
## (score 4386.689 & scale 27.47637).
## Hessian positive definite, eigenvalue range [29.20189,690.505].
## Model rank =  68 / 68 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##       k' edf k-index p-value
## s(ID) 65  63      NA      NA

Random intercepts + slopes Effect

model3_b <- bam(weight_kg ~ age_years
                + height_cm
                + s(ID, bs="re")
                + s(ID, age_years, bs="re"),
          data=a_unique_07_boys)
#summary(model3_b)
model3_b_1<-getViz(model3_b)
print(plot(model3_b_1, allTerms = T), pages = 1)

plot(model3_b)

gam.check(model3_b)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 6 iterations.
## Gradient range [-5.533728e-05,4.951273e-05]
## (score 3928.869 & scale 11.49162).
## Hessian positive definite, eigenvalue range [23.07729,691.7442].
## Model rank =  133 / 133 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                   k'  edf k-index p-value
## s(ID)           65.0 59.0      NA      NA
## s(ID,age_years) 65.0 61.8      NA      NA

Comparing Models

library(itsadug)
compareML(model2_b,model3_b)
## model2_b: weight_kg ~ age_years + height_cm + s(ID, bs = "re")
## 
## model3_b: weight_kg ~ age_years + height_cm + s(ID, bs = "re") + s(ID, 
##  model2_b: weight_kg ~ age_years + height_cm + s(ID, bs = "re")
## 
## model3_b:     age_years, bs = "re")
## 
## Chi-square test of fREML scores
## -----
##          Model    Score Edf   Chisq    Df  p.value Sig.
##       model2_b 4386.689   4                            
## fREML model3_b 3928.869   5 457.820 1.000  < 2e-16  ***
## 
## AIC difference: 1149.86, model model3_b has lower AIC.

Random intercepts + slopes + smooths Effect

model4_b <- bam(weight_kg ~ age_years
                + height_cm
                + s(ID, bs="re")
                + s(ID, age_years, bs="re")
                + s(age_years, ID, bs="fs", m=1),
          data=a_unique_07_boys)
#summary(model4_b)

model4_b_1<-getViz(model4_b)
print(plot(model4_b_1, allTerms = T), pages = 1)

#vis.gam(model4_b,theta=30,ticktype="detailed")
#vis.gam(model4_b,theta=-45,ticktype="detailed",se=2)
#vis.gam(model4_b,plot.type="contour")
compareML(model3_b, model4_b)
## model3_b: weight_kg ~ age_years + height_cm + s(ID, bs = "re") + s(ID, 
## 
## model4_b: weight_kg ~ age_years + height_cm + s(ID, bs = "re") + s(ID, 
##  model3_b:     age_years, bs = "re")
## 
## model4_b:     age_years, bs = "re") + s(age_years, ID, bs = "fs", m = 1)
## 
## Chi-square test of fREML scores
## -----
##          Model    Score Edf   Chisq    Df  p.value Sig.
##       model3_b 3928.869   5                            
## fREML model4_b 3488.335   7 440.534 2.000  < 2e-16  ***
## 
## AIC difference: 1468.35, model model4_b has lower AIC.

We can conclude that Model 4 is best which explains boys weights.

Checking Auto Correlation Function

par(mfrow=c(1,3), cex=1.1)
acf_resid(model2_b, split_pred="ID", main="ACF resid(model2)")
acf_resid(model3_b, split_pred="ID", main="ACF resid(model3)")
acf_resid(model4_b, split_pred="ID", main="ACF resid(model4)")

No Random Effect

#For Girls
model1_g <- bam(weight_kg ~ age_years
                + height_cm,
          data=a_unique_07_girls)
summary(model1_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## weight_kg ~ age_years + height_cm
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -72.34644    3.38260  -21.39  < 2e-16 ***
## age_years     0.96514    0.13788    7.00 3.74e-12 ***
## height_cm     0.71406    0.03104   23.01  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.648   Deviance explained = 64.9%
## -REML = 5958.1  Scale est. = 90.849    n = 1621
gam.check(model1_g)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-0.002154512,-0.002154512]
## (score 5958.075 & scale 90.8494).
## Hessian positive definite, eigenvalue range [809.0022,809.0022].
## Model rank =  3 / 3

Random intercepts Effect

model2_g <- bam(weight_kg ~ age_years
                + height_cm
                + s(ID, bs="re"),
          data=a_unique_07_girls)
#summary(model2_g)
model2_g_1<-getViz(model2_g)
print(plot(model2_g_1, allTerms = T), pages = 1)

gam.check(model2_g)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 6 iterations.
## Gradient range [-5.433067e-06,8.541277e-07]
## (score 4932.433 & scale 20.95561).
## Hessian positive definite, eigenvalue range [34.70023,810.7665].
## Model rank =  79 / 79 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##       k' edf k-index p-value
## s(ID) 76  74      NA      NA

Random intercepts + slopes Effect

model3_g <- bam(weight_kg ~ age_years
                + height_cm
                + s(ID, bs="re")
                + s(ID, age_years, bs="re"),
          data=a_unique_07_girls)
#summary(model3_b)
model3_g_1<-getViz(model3_g)
print(plot(model2_g_1, allTerms = T), pages = 1)

gam.check(model3_g)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 6 iterations.
## Gradient range [-4.772119e-09,7.423751e-11]
## (score 4476.055 & scale 9.931909).
## Hessian positive definite, eigenvalue range [25.97655,812.1259].
## Model rank =  155 / 155 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                   k'  edf k-index p-value
## s(ID)           76.0 68.0      NA      NA
## s(ID,age_years) 76.0 71.9      NA      NA

Comparing Models

compareML(model2_g,model3_g)
## model2_g: weight_kg ~ age_years + height_cm + s(ID, bs = "re")
## 
## model3_g: weight_kg ~ age_years + height_cm + s(ID, bs = "re") + s(ID, 
##  model2_g: weight_kg ~ age_years + height_cm + s(ID, bs = "re")
## 
## model3_g:     age_years, bs = "re")
## 
## Chi-square test of fREML scores
## -----
##          Model    Score Edf   Chisq    Df  p.value Sig.
##       model2_g 4932.433   4                            
## fREML model3_g 4476.055   5 456.377 1.000  < 2e-16  ***
## 
## AIC difference: 1148.71, model model3_g has lower AIC.

Random intercepts + slopes + smooths Effect

model4_g <- bam(weight_kg ~ age_years
                + height_cm
                + s(ID, bs="re")
                + s(ID, age_years, bs="re")
                + s(age_years, ID, bs="fs", m=1),
          data=a_unique_07_girls)
#summary(model4_b)
model4_g_1<-getViz(model4_g)
print(plot(model4_g_1, allTerms = T), pages = 1)

#vis.gam(model4_g,theta=30,ticktype="detailed")
#vis.gam(model4_g,theta=-45,ticktype="detailed",se=2)
#vis.gam(model4_g,plot.type="contour")
compareML(model3_g, model4_g)
## model3_g: weight_kg ~ age_years + height_cm + s(ID, bs = "re") + s(ID, 
## 
## model4_g: weight_kg ~ age_years + height_cm + s(ID, bs = "re") + s(ID, 
##  model3_g:     age_years, bs = "re")
## 
## model4_g:     age_years, bs = "re") + s(age_years, ID, bs = "fs", m = 1)
## 
## Chi-square test of fREML scores
## -----
##          Model    Score Edf   Chisq    Df  p.value Sig.
##       model3_g 4476.055   5                            
## fREML model4_g 3969.953   7 506.103 2.000  < 2e-16  ***
## 
## AIC difference: 1725.90, model model4_g has lower AIC.

We can conclude that Model 4 is best which explains girls weights.

Checking Auto Correlation Function

par(mfrow=c(1,3), cex=1.1)
acf_resid(model2_g, split_pred="ID", main="ACF resid(model2)")
acf_resid(model3_g, split_pred="ID", main="ACF resid(model3)")
acf_resid(model4_g, split_pred="ID", main="ACF resid(model4)")

#Boys 
#Weight Prediction using the model 
a_unique_07_boys$pred_wt<-predict(model4_b,a_unique_07_boys, type='response')

#Comparing Actual and Predicted weight for Boys
b_act<-ggplot(a_unique_07_boys,aes(x=measurement_date, y=weight_kg, group=ID))+
  geom_line()+ggtitle("Actual Weights (Boys)")

b_pred<-ggplot(a_unique_07_boys,aes(x=measurement_date, y=pred_wt, group=ID))+
  geom_line()+ggtitle("Predicted Weights (Boys)")

plot_grid(b_act,b_pred)

#Girls 
#Weight Prediction using the model 
a_unique_07_girls$pred_wt<-predict(model4_g,a_unique_07_girls, type='response')

#Comparing Actual and Predicted weight for Girls
g_act<-ggplot(a_unique_07_girls,aes(x=measurement_date, y=weight_kg, group=ID))+
  geom_line()+ggtitle("Actual Weights (Girls)")

g_pred<-ggplot(a_unique_07_girls,aes(x=measurement_date, y=pred_wt, group=ID))+
  geom_line()+ggtitle("Predicted Weights (Girls)")

plot_grid(g_act,g_pred)

#Checking overall Actual and predicted for Boys

weight_mean_boy<-a_unique_07_boys %>% group_by(measurement_date) %>% 
  dplyr::summarise(wt_mean=mean(weight_kg),pred_wt_mean=mean(pred_wt))

boy<-ggplot(weight_mean_boy)+
  geom_line(aes(x=measurement_date,y=wt_mean,color='blue'))+
  geom_line(aes(x=measurement_date,y=pred_wt_mean,color='red'))+
  scale_color_discrete(name = "Legends", labels = c( "Predicted","Actual"))

#Checking overall Actual and predicted for Girls

weight_mean_girl<-a_unique_07_girls %>% group_by(measurement_date) %>% 
  dplyr::summarise(wt_mean=mean(weight_kg),pred_wt_mean=mean(pred_wt))

girl<-ggplot(weight_mean_girl)+
  geom_line(aes(x=measurement_date,y=wt_mean,color='blue'))+
  geom_line(aes(x=measurement_date,y=pred_wt_mean,color='red'))+
  scale_color_discrete(name = "Legends", labels = c( "Predicted","Actual"))

#Comparing
par(mfrow=c(1,2), cex=.1)
plot_grid(boy, girl, labels = c("Weight of Boys","Weight of girl"))

BMI Analysis

#Boys BMI
bmi_b<-ggplot(a_unique_07_boys, aes(x=age_years, y=BMI, color=ID)) + geom_line() +
  theme(legend.position = "none") + ggtitle("BMI of all Boys ")
ggplotly(bmi_b)
#Girls BMI
bmi_g<-ggplot(a_unique_07_girls, aes(x=age_years, y=BMI, color=ID)) + geom_line() +
    theme(legend.position = "none") + ggtitle("BMI of all Girls ")
ggplotly(bmi_g)
plot_grid(bmi_b, bmi_g)

#Boys
model1_bmi_b <- bam(BMI ~ age_years,
          data=a_unique_07_boys)
summary(model1_bmi_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BMI ~ age_years
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.75218    0.41140   28.57   <2e-16 ***
## age_years    0.63764    0.03115   20.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.232   Deviance explained = 23.3%
## -REML = 3784.1  Scale est. = 13.988    n = 1381
gam.check(model1_bmi_b)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-0.0004105247,-0.0004105247]
## (score 3784.141 & scale 13.98764).
## Hessian positive definite, eigenvalue range [689.5004,689.5004].
## Model rank =  2 / 2

Random intercepts Effect

model2_bmi_b <- bam(BMI ~ age_years
          + s(ID, bs="re"),
          data=a_unique_07_boys)
#summary(model2_bmi_b)
model2_bmi_b_1<-getViz(model2_bmi_b)
print(plot(model2_bmi_b_1, allTerms = T), pages = 1)

gam.check(model2_bmi_b)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 8 iterations.
## Gradient range [-8.039312e-07,7.80885e-07]
## (score 2714.612 & scale 2.388269).
## Hessian positive definite, eigenvalue range [29.87295,691.0232].
## Model rank =  67 / 67 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##         k'  edf k-index p-value
## s(ID) 65.0 63.4      NA      NA

Random intercepts + slopes Effect

model3_bmi_b <- bam(BMI ~ age_years
          + s(ID, bs="re")
          + s(ID, age_years, bs="re"),
          data=a_unique_07_boys)
#summary(model3_bmi_b)
model3_bmi_b_1<-getViz(model3_bmi_b)
print(plot(model3_bmi_b_1, allTerms = T), pages = 1)

gam.check(model3_bmi_b)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 7 iterations.
## Gradient range [-1.22667e-07,9.565373e-08]
## (score 2411.887 & scale 1.281127).
## Hessian positive definite, eigenvalue range [23.69611,692.2592].
## Model rank =  132 / 132 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                   k'  edf k-index p-value
## s(ID)           65.0 60.6      NA      NA
## s(ID,age_years) 65.0 60.6      NA      NA

Comparing Models

compareML(model2_bmi_b,model3_bmi_b)
## model2_bmi_b: BMI ~ age_years + s(ID, bs = "re")
## 
## model3_bmi_b: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re")
## 
## Chi-square test of fREML scores
## -----
##              Model    Score Edf   Chisq    Df  p.value Sig.
##       model2_bmi_b 2714.612   3                            
## fREML model3_bmi_b 2411.887   4 302.725 1.000  < 2e-16  ***
## 
## AIC difference: 806.10, model model3_bmi_b has lower AIC.

Random intercepts + slopes + smooths Effect

model4_bmi_b <- bam(BMI ~ age_years
          + s(ID, bs="re")
          + s(ID, age_years, bs="re")
          + s(age_years, ID, bs="fs", m=1),
          data=a_unique_07_boys)
#summary(model4_bmi_b)
model4_bmi_b_1<-getViz(model4_bmi_b)
print(plot(model4_bmi_b_1, allTerms = T), pages = 1)

Comparing Model3 and Model4

compareML(model3_bmi_b, model4_bmi_b)
## model3_bmi_b: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re")
## 
## model4_bmi_b: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re") + 
##  model3_bmi_b: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re")
## 
## model4_bmi_b:     s(age_years, ID, bs = "fs", m = 1)
## 
## Chi-square test of fREML scores
## -----
##              Model    Score Edf   Chisq    Df  p.value Sig.
##       model3_bmi_b 2411.887   4                            
## fREML model4_bmi_b 2111.616   6 300.270 2.000  < 2e-16  ***
## 
## AIC difference: 1064.88, model model4_bmi_b has lower AIC.

Checking Auto Correlation Function

par(mfrow=c(1,3), cex=1.1)
acf_resid(model2_bmi_b, split_pred="ID", main="ACF resid(model2)")
acf_resid(model3_bmi_b, split_pred="ID", main="ACF resid(model3)")
acf_resid(model4_bmi_b, split_pred="ID", main="ACF resid(model4)")

#Girls
model1_bmi_g <- bam(BMI ~ age_years,
          data=a_unique_07_girls)
summary(model1_bmi_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BMI ~ age_years
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.27666    0.39949   30.73   <2e-16 ***
## age_years    0.64182    0.03054   21.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.214   Deviance explained = 21.4%
## -REML =   4533  Scale est. = 15.664    n = 1621
gam.check(model1_bmi_g)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-2.082481e-08,-2.082481e-08]
## (score 4533.035 & scale 15.66368).
## Hessian positive definite, eigenvalue range [809.5,809.5].
## Model rank =  2 / 2

Random intercepts Effect

model2_bmi_g <- bam(BMI ~ age_years
          + s(ID, bs="re"),
          data=a_unique_07_girls)
#summary(model2_bmi_b)
model2_bmi_g_1<-getViz(model2_bmi_g)
print(plot(model2_bmi_g_1, allTerms = T), pages = 1)

gam.check(model2_bmi_g)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 8 iterations.
## Gradient range [-1.346668e-06,1.304211e-06]
## (score 3336.76 & scale 2.89105).
## Hessian positive definite, eigenvalue range [34.94545,811.2783].
## Model rank =  78 / 78 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##         k'  edf k-index p-value
## s(ID) 76.0 74.2      NA      NA

Random intercepts + slopes Effect

model3_bmi_g <- bam(BMI ~ age_years
          + s(ID, bs="re")
          + s(ID, age_years, bs="re"),
          data=a_unique_07_girls)
#summary(model3_bmi_b)
model3_bmi_g_1<-getViz(model3_bmi_g)
print(plot(model3_bmi_g_1, allTerms = T), pages = 1)

gam.check(model3_bmi_g)

## 
## Method: fREML   Optimizer: perf newton
## full convergence after 8 iterations.
## Gradient range [-1.0441e-09,9.817569e-11]
## (score 2981.451 & scale 1.558866).
## Hessian positive definite, eigenvalue range [27.48084,812.6948].
## Model rank =  154 / 154 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                   k'  edf k-index p-value
## s(ID)           76.0 70.5      NA      NA
## s(ID,age_years) 76.0 70.9      NA      NA

Comparing Models

compareML(model2_bmi_g,model3_bmi_g)
## model2_bmi_g: BMI ~ age_years + s(ID, bs = "re")
## 
## model3_bmi_g: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re")
## 
## Chi-square test of fREML scores
## -----
##              Model    Score Edf   Chisq    Df  p.value Sig.
##       model2_bmi_g 3336.760   3                            
## fREML model3_bmi_g 2981.451   4 355.309 1.000  < 2e-16  ***
## 
## AIC difference: 938.51, model model3_bmi_g has lower AIC.

Random intercepts + slopes + smooths Effect

model4_bmi_g <- bam(BMI ~ age_years
          + s(ID, bs="re")
          + s(ID, age_years, bs="re")
          + s(age_years, ID, bs="fs", m=1),
          data=a_unique_07_girls)
#summary(model4_bmi_g)
model4_bmi_g_1<-getViz(model4_bmi_g)
print(plot(model4_bmi_g_1, allTerms = T), pages = 1)

compareML(model3_bmi_g, model4_bmi_g)
## model3_bmi_g: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re")
## 
## model4_bmi_g: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re") + 
##  model3_bmi_g: BMI ~ age_years + s(ID, bs = "re") + s(ID, age_years, bs = "re")
## 
## model4_bmi_g:     s(age_years, ID, bs = "fs", m = 1)
## 
## Chi-square test of fREML scores
## -----
##              Model    Score Edf   Chisq    Df  p.value Sig.
##       model3_bmi_g 2981.451   4                            
## fREML model4_bmi_g 2559.933   6 421.518 2.000  < 2e-16  ***
## 
## AIC difference: 1434.70, model model4_bmi_g has lower AIC.

Checking Auto Correlation Function

par(mfrow=c(1,3), cex=1.1)
acf_resid(model2_bmi_g, split_pred="ID", main="ACF resid(model2)")
acf_resid(model3_bmi_g, split_pred="ID", main="ACF resid(model3)")
acf_resid(model4_bmi_g, split_pred="ID", main="ACF resid(model4)")

#Boys 
#BMI Prediction using the model 
a_unique_07_boys$pred_bmi<-predict(model4_bmi_b,a_unique_07_boys, type='response')

#Comparing Actual and Predicted BMI for Boys
b_bmi_act<-ggplot(a_unique_07_boys,aes(x=measurement_date, y=BMI, group=ID))+
  geom_line()+ggtitle("Actual BMI (Boys)")

b_bmi_pred<-ggplot(a_unique_07_boys,aes(x=measurement_date, y=pred_bmi, group=ID))+
  geom_line()+ggtitle("Predicted BMI (Boys)")

plot_grid(b_bmi_act,b_bmi_pred)

#Girls 
#BMI Prediction using the model 
a_unique_07_girls$pred_bmi<-predict(model4_bmi_g,a_unique_07_girls, type='response')

#Comparing Actual and Predicted weight for Girls
g_bmi_act<-ggplot(a_unique_07_girls,aes(x=measurement_date, y=BMI, group=ID))+
  geom_line()+ggtitle("Actual BMI (Girls)")

g_bmi_pred<-ggplot(a_unique_07_girls,aes(x=measurement_date, y=pred_bmi, group=ID))+
  geom_line()+ggtitle("Predicted BMI (Girls)")

plot_grid(g_bmi_act,g_bmi_pred)

#Checking overall Actual and Predicted BMI for boys

bmi_mean_boy<-a_unique_07_boys %>% group_by(measurement_date) %>% 
  dplyr::summarise(bmi_mean=mean(BMI),pred_bmi_mean=mean(pred_bmi))

boy_bmi<-ggplot(bmi_mean_boy)+
  geom_line(aes(x=measurement_date,y=bmi_mean,color='blue'))+
  geom_line(aes(x=measurement_date,y=pred_bmi_mean,color='red'))+
  scale_color_discrete(name = "Legends", labels = c( "Predicted","Actual"))

#Checking overall Actual and predicted for Girls

bmi_mean_girl<-a_unique_07_girls %>% group_by(measurement_date) %>% 
  dplyr::summarise(bmi_mean=mean(BMI),pred_bmi_mean=mean(pred_bmi))

girl_bmi<-ggplot(bmi_mean_girl)+
  geom_line(aes(x=measurement_date,y=bmi_mean,color='blue'))+
  geom_line(aes(x=measurement_date,y=pred_bmi_mean,color='red'))+
  scale_color_discrete(name = "Legends", labels = c( "Predicted","Actual"))

#Comparing
par(mfrow=c(1,2), cex=.1)
plot_grid(boy_bmi, girl_bmi, labels = c("BMI of Boys","BMI of girl"))

#Lightweight and Least BMI kids in 2007 Analysis (Boys and Girls)

#Weight
weightl_07_boy <- a_07_18 %>% arrange(weight_kg) %>% filter((gender) %in% c("boy"))%>% head(5)
weightl_07_boy_IDs<-weightl_07_boy$ID

weightl_07_girl <- a_07_18 %>% arrange(weight_kg) %>% filter((gender) %in% c("girl")) %>% head(5)
weightl_07_girl_IDs<-weightl_07_girl$ID

#BMI
bmil_07_boy <- a_07_18 %>% arrange(BMI) %>% filter((gender) %in% c("boy"))%>% head(5)
bmil_07_boy_IDs<-bmil_07_boy$ID

bmil_07_girl <- a_07_18 %>% arrange(BMI) %>% filter((gender) %in% c("girl")) %>% head(5)
bmil_07_girl_IDs<-bmil_07_girl$ID
#Checking weight rate of the lightest kids from 2007 throught 2018
#BOY
light_boys<-a_unique_07_pct %>% filter((ID) %in% weightl_07_boy_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_wt, pct_change_ht, pct_change_bmi,pct_change_ht)

ggplot(light_boys)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
light_girls<-a_unique_07_pct %>% filter((ID) %in% weightl_07_girl_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_wt, pct_change_bmi,pct_change_ht)

ggplot(light_girls)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

#Checking BMI rate of the lowest BMI kids from 2007 throught 2018
#BOY
lowb_boys<-a_unique_07_pct %>% filter((ID) %in% bmil_07_boy_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_bmi,pct_change_ht,pct_change_wt)

ggplot(lowb_boys)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
lowb_girls<-a_unique_07_pct %>% filter((ID) %in% bmil_07_girl_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_bmi,pct_change_ht, pct_change_wt)

ggplot(lowb_girls)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

#Heavyweight and High BMI kids in 2007 Analysis (Boys and Girls)

#Weight
weighth_07_boy <- a_07_18 %>% arrange(desc(weight_kg)) %>% filter((gender) %in% c("boy"))%>% head(5)
weighth_07_boy_IDs<-weighth_07_boy$ID

weighth_07_girl <- a_07_18 %>% arrange(desc(weight_kg)) %>% filter((gender) %in% c("girl")) %>% head(5)
weighth_07_girl_IDs<-weighth_07_girl$ID

#BMI
bmih_07_boy <- a_07_18 %>% arrange(desc(BMI)) %>% filter((gender) %in% c("boy"))%>% head(5)
bmih_07_boy_IDs<-bmil_07_boy$ID

bmih_07_girl <- a_07_18 %>% arrange(desc(BMI)) %>% filter((gender) %in% c("girl")) %>% head(5)
bmih_07_girl_IDs<-bmih_07_girl$ID
#Checking Weight rate of the Heavy kids from 2007 throught 2018
#BOY
heavy_boys<-a_unique_07_pct %>% filter((ID) %in% weighth_07_boy_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_wt,pct_change_bmi,pct_change_ht)

ggplot(heavy_boys)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
heavy_girls<-a_unique_07_pct %>% filter((ID) %in% weighth_07_girl_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_wt,pct_change_bmi,pct_change_ht)

ggplot(heavy_girls)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

#Checking BMI rate of the High BMI kids from 2007 throught 2018
#BOY
highb_boys<-a_unique_07_pct %>% filter((ID) %in% bmih_07_boy_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_bmi,pct_change_ht,pct_change_wt)

ggplot(highb_boys)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
highb_girls<-a_unique_07_pct %>% filter((ID) %in% bmih_07_girl_IDs) %>% 
  select(ID, measurement_date, weight_kg, pct_change_bmi,pct_change_ht,pct_change_wt)

ggplot(highb_girls)+
  geom_line(aes(x=measurement_date, y = pct_change_wt,col="% Weight Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_bmi,col="% BMI Change"))+
  geom_line(aes(x=measurement_date, y = pct_change_ht,col="% Height Change"))+
  facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).

en <- read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_exercise_normal.xlsx",na="NA")

#Dealing with NAs
en_na<-na.omit(en)

#Renaming Variables(Renaming Column Names for easy code handling and Cleaning Data)
names(en_na)[names(en_na)=='measurement date']<- 'measurement_date'
names(en_na)[names(en_na)=='age (years)']<- 'age_years'
names(en_na)[names(en_na)=='age bin']<- 'age_bin'
names(en_na)[length(en_na)]<-"DBP_10"
names(en_na)[9]<-"z_score_WHO"
names(en_na)[6]<-"height_cm"
names(en_na)[7]<-"weight_kg"
names(en_na)[10]<-"z_cat_WHO"
names(en_na)[11]<-"running_dist_m"
names(en_na)[12]<-"running_time_s"
names(en_na)[13]<-"running_speed"
names(en_na)[14]<-"pulse_0"
names(en_na)[15]<-"pulse_1"
names(en_na)[16]<-"pulse_5"
names(en_na)[17]<-"pulse_10"
names(en_na)[18]<-"SBP_0"
names(en_na)[19]<-"SBP_1"
names(en_na)[20]<-"SBP_5"
names(en_na)[21]<-"SBP_10"
names(en_na)[22]<-"DBP_0"
names(en_na)[23]<-"DBP_1"
names(en_na)[24]<-"DBP_5"

#Changing Data Types
en_na <- en_na %>%
  mutate(gender = as.factor(gender),
         z_cat_WHO=as.factor(z_cat_WHO),
         measurement_date=as.Date(measurement_date),
         BMI = as.numeric(BMI),
         z_score_WHO=as.numeric(z_score_WHO),
         height_cm=as.numeric(height_cm),
         weight_kg=as.numeric(weight_kg))

#Adding additional column
en_na$year <- year(en_na$measurement_date)
en_na$ID <- as.factor(en_na$ID)
# Checking Pulse, SBP, DBP for boys and girls
Pulse<-ggplot(en_na)+
  geom_col(aes(y=pulse_10, x=age_bin ,fill=gender), position='dodge')+
  ggtitle("Pulse for Boys and Girls")

SBP<-ggplot(en_na)+
  geom_col(aes(y=SBP_10, x=age_bin,fill=gender), position='dodge') +
  ggtitle("SBP for Boys and Girls")

DBP<-ggplot(en_na)+
  geom_col(aes(y=DBP_10, x=age_bin,fill=gender), position='dodge') +
  ggtitle("DBP for Boys and Girls")

#Comparing
par(mfrow=c(1,3), cex=.1)
plot_grid(Pulse, SBP, DBP)

#Grouping by ID, Gender and running distance
grp <- en_na %>% group_by(ID, gender, running_dist_m) %>% 
  dplyr::summarise(mean_SBP_0=mean(SBP_0),mean_SBP_1=mean(SBP_1),mean_SBP_5=mean(SBP_5),mean_SBP_10=mean(SBP_10),
                   mean_pulse_0=mean(pulse_0),mean_pulse_1=mean(pulse_1), mean_pulse_5=mean(pulse_5),
                   mean_pulse_10=mean(pulse_10), mean_DBP_0=mean(DBP_0), mean_DBP_1=mean(DBP_1),
                   mean_DBP_5=mean(DBP_5),mean_DBP_10=mean(DBP_10),
                   mean_running_speed=mean(running_speed))

#New dataset by mean
x<-grp %>%
  pivot_longer(mean_pulse_0:mean_pulse_10, names_to = "Pulse", values_to = "P_Count") %>%
  pivot_longer(mean_SBP_0:mean_SBP_10, names_to = "SBP", values_to = "SBP_Count") %>%
  pivot_longer(mean_DBP_0:mean_DBP_10, names_to = "DBP", values_to = "DBP_Count")

#Making factors and levelling
x$Pulse<-factor(x$Pulse,levels=c('mean_pulse_0','mean_pulse_1','mean_pulse_5','mean_pulse_10'))
x$SBP<-factor(x$SBP, levels = c('mean_SBP_0','mean_SBP_1','mean_SBP_5','mean_SBP_10'))
x$DBP<-factor(x$DBP,levels = c('mean_DBP_0','mean_DBP_1','mean_DBP_5','mean_DBP_10'))

P<-ggplot(x)+
  geom_col(aes(x=mean_running_speed, y=P_Count, color=Pulse),position='Dodge2')+
  facet_wrap(vars(gender, running_dist_m)) + transition_manual(Pulse,cumulative = TRUE)
P
## nframes and fps adjusted to match transition

S<-ggplot(data=x)+
  geom_col(aes(x=mean_running_speed, y=SBP_Count, color=SBP),position='Dodge')+
  facet_wrap(vars(gender, running_dist_m)) + transition_manual(SBP,cumulative = TRUE)

S
## nframes and fps adjusted to match transition

D<-ggplot(x)+
  geom_col(aes(x=mean_running_speed, y=DBP_Count, color=DBP),position='Dodge2')+
  facet_wrap(vars(gender, running_dist_m)) + transition_manual(DBP,cumulative = TRUE)

D
## nframes and fps adjusted to match transition

#And Compare both shortest and tallest
#check weight and BMI and heartrate as well


#Clustering of growths.
#Model Explaination if not used in the lectures.
#Weight analysis